07_analysis3

01_load

Libraries

library("here")
library("tidyverse")

Download data

The data and _raw directories are only created if they do not already exist.

source("99_proj_func.R")
create_directory(here("data"))
create_directory(here("data", "_raw"))
if(!file.exists(here("data", "_raw", "GSE41080_RAW.tar"))){
  curl::curl_download(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE41080&format=file", 
                      destfile = here("data", "_raw", "GSE41080_RAW.tar"))
}

Expression

untar(here("data", "_raw", "GSE41080_RAW.tar"), 
      exdir = here("data", "expression"))
expression_files <- list.files(here("data", "expression"))

The first file in expression_files is a reference file for the probe IDs in the other files. We do not use this file and as a result it is not read. The column names are added manually and the first line skipped because column names in the files contain the general column name plus a number that is different.

data_expression <- read_delim(here("data", "expression", expression_files[2:92]), 
                              id = "file", 
                              col_names = c("PROBE_ID",
                                            "NBEADS1",
                                            "Signal",
                                            "Pval",
                                            "NARRAYS",
                                            "ARRAY_STDEV",
                                            "BEAD_STDERR",
                                            "NBEADS2",
                                            "SEARCH_KEY"), 
                              skip = 1,
                              na = "NaN") |> 
  mutate(file = str_extract(file, "GSM[0-9]+"))
write_tsv(data_expression, here("data", "01_expression_dat_load.tsv.gz"))

Meta data

The rows after row 89 contain no information and are as a result not included

metadata <- read_delim(here("data", "_raw", "msb201315-s2.csv"), 
                       delim = ",", 
                       na = "NA" , 
                       n_max= 89)
write_tsv(metadata, here("data", "01_meta_dat_load.tsv.gz"))

02_clean

Libraries

library("here")
library("tidyverse")

Expression data

For the analysis only the probe ID and signal is needed. The p-value is used to choose which probes to work with. The file identifier is needed to combine the expression data and the metadata. Two people did not complete the trial and as a result there is no metadata for them so their expression data is removed.

data_expression <- read_tsv(here("data", "01_expression_dat_load.tsv.gz")) |> 
  select(file, PROBE_ID, Signal, Pval) |> 
  filter(file != "GSM1008315" & file != "GSM1008340")

For the analysis only probes with an average p-value under 0.05 are used

data_expression <- data_expression |>
  group_by(PROBE_ID) |> 
  nest() |> 
  mutate(Avg_Pval = map_dbl(.x = data, 
                            .f = ~mean(pull(.x, Pval)))) |> 
  unnest(cols = c(data)) |> 
  ungroup() |>
  filter(Avg_Pval < 0.05) |> 
  select(-Avg_Pval)
write_tsv(data_expression, here("data", "02_expression_dat_clean.tsv.gz"))

Metadata

metadata <- read_tsv(here("data", "01_meta_dat_load.tsv.gz"))

metadata <- metadata |> 
  select(-starts_with("Season"),
        -starts_with("VacType"),
        -Prior_vac,
        -VACCOUNT,
        -FLU_EVR,
        -FLU_HOSP)

write_tsv(metadata, here("data", "02_meta_dat_clean.tsv"))

03_augment

library("tidyverse")
library("here")

Load the data from the final files.

metadata <- read_tsv(here("data", "02_meta_dat_clean.tsv"))
analysis_data <- read_tsv(here("data", "02_expression_dat_clean.tsv.gz"))

In order to create the final dataset, we need to pivot wider the data with mRNA expression using as row names the sample IDs, column names will be the Probe IDs and the values are the signals for each Probe.

Pval <- analysis_data$Pval

analysis_data <- analysis_data |> 
  pivot_wider(id_cols = file,         
              names_from = PROBE_ID,  
              values_from = Signal) 

We, also, need to connect the two datasets by the IDs.

mapping_key <- analysis_data |> 
  select(file) |>
  distinct()

metadata <- metadata |> 
  mutate(mapping_key) |> 
  select(-xID)

Now we can connect the two datasets using as key to join the mapping_key created.

analysis_data <- metadata |> 
  full_join(analysis_data, by = "file") |> 
  relocate("file") 

analysis_data <- analysis_data |>
  rename(EBV_pos = "EBV (1 = pos)",
         CMV_pos = "CMV (1= pos)")

analysis_data <- analysis_data |>
  mutate(
    Age_Group = case_when(
      Age >= 20 & Age <= 30 ~ "Young",
      Age >= 61 ~ "Older"),
    Cytomegalovirus = case_when(
      CMV_pos == 1 ~ "Positive",
      CMV_pos == 0 ~ "Negative"),
    EpsteinBarrvirus = case_when(
      EBV_pos == 1 ~ "Positive",
      EBV_pos == 0 ~ "Negative")) |> 
  relocate(Age_Group, .after = Age) |> 
  relocate(Cytomegalovirus, .after = BMI) |> 
  relocate(EpsteinBarrvirus, .after = BMI ) |> 
  select(-`CMV_pos`, -EBV_pos) 

The code checks whether each person had a 4-fold antibody increase for the three flu strains, counts how many strains they responded to, and labels them as poor or good responders. It then reorganizes these new columns and summarizes response rates by age group and number of strains for plotting.

analysis_data <- analysis_data |>
  mutate(
    H1N1_fold = (H1N1v3 / H1N1v1) >= 4,
    B_fold    = (Bv3 / Bv1) >= 4,
    H3N2_fold = (H3N2v3 / H3N2v1) >= 4,
    Strains = H1N1_fold + B_fold + H3N2_fold,
    Response_group = if_else(Strains <= 1, "Poor responders", "Good responders")
  )

analysis_data <- analysis_data |>
  relocate(H1N1_fold, B_fold, H3N2_fold, Strains, Response_group,
           .after = Cytomegalovirus)
write_tsv(analysis_data, here("data", "03_expression_dat_aug.tsv.gz"))

rm(list = ls())

04_describe

Loading the libraries we will need

library("table1")
library("tidyverse")
library("here")
source("99_proj_func.R")

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)

Loading the dataset we are going to analyse.

analysis_data <- read_tsv(here("data", "03_expression_dat_aug.tsv.gz"))

To describe the dataset, we are making two tables. The first table shows the average age and BMI of the two groups, Old and young, as well as the range of each measure in both groups. The second table shows the distribution of each group in the three variables: Gender, Cytomegalovirus and Epstein-Barr virus (positive/negative).

numeric_summary <- analysis_data |>
  group_by(Age_Group) |>
  summarize(Age_Median_Range = str_c(
      round(median(Age, na.rm = TRUE), 1), 
      " (", 
      min(Age, na.rm = TRUE), 
      "–", 
      max(Age, na.rm = TRUE), 
      ")"),
    BMI_Median_Range = str_c(
      round(median(BMI, na.rm = TRUE), 1), 
      " (", 
      min(BMI, na.rm = TRUE), 
      "–", 
      max(BMI, na.rm = TRUE), 
      ")"),
    .groups = 'drop')

gender_table <- get_categorical_summary(analysis_data, Gender) |> 
  mutate(Variable = "Gender")
cmv_table <- get_categorical_summary(analysis_data, Cytomegalovirus) |> 
  mutate(Variable = "Cytomegalovirus")
ebv_table <- get_categorical_summary(analysis_data, EpsteinBarrvirus) |> 
  mutate(Variable = "EpsteinBarrvirus")

categorical_summary <- bind_rows(gender_table, cmv_table, ebv_table)

final_categorical_table <- categorical_summary |>
  pivot_wider(names_from = Age_Group,
              values_from = Value,
              id_cols = c(Variable, Characteristic),
              names_sort = TRUE)

print(numeric_summary)
# A tibble: 2 × 3
  Age_Group Age_Median_Range BMI_Median_Range
  <chr>     <chr>            <chr>           
1 Older     78 (61–93)       25.1 (18–47.3)  
2 Young     24 (20–30)       22.9 (18.8–43.6)
print(final_categorical_table)
# A tibble: 6 × 4
  Variable         Characteristic Older    Young   
  <chr>            <chr>          <chr>    <chr>   
1 Gender           Female         40 (67%) 14 (48%)
2 Gender           Male           20 (33%) 15 (52%)
3 Cytomegalovirus  Negative       24 (40%) 13 (45%)
4 Cytomegalovirus  Positive       36 (60%) 16 (55%)
5 EpsteinBarrvirus Negative       19 (32%) 13 (45%)
6 EpsteinBarrvirus Positive       41 (68%) 16 (55%)

05_analysis1

Libraries

library("tidyverse")
library("here")
library("broom")
library("patchwork")
library("ggridges")

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)

PCA

A PCA on just the signals from the probes is done to see if it can show a connection between the signals pre vaccine and the response to the vaccine. The PCA is done with inspiration from “PCA tidyverse style” by Claus O. Wilke

analysis_data <- read_tsv(here("data", "03_expression_dat_aug.tsv.gz"))
pca <- analysis_data |> 
  select(starts_with("ILMN_")) |> 
  scale() |> 
  prcomp()

Variance explained by principal components

pca_var <- pca |> 
  tidy("eigenvalues")

p1 <- pca_var |> 
  ggplot(aes(x = PC, 
             y = percent,
             fill = "#ED6F58",
             color = "#ED6F58")) +
  geom_col() +
  labs(title = "Individual",
       subtitle = "All PCs",
       y = "Explained Variance") +
  theme_minimal() +
  theme(legend.position = "none")

p2 <- pca_var |> 
  filter(PC < 6) |> 
  ggplot(aes(x = PC, 
             y = percent,
             fill = "#ED6F58",
             color = "#ED6F58")) +
  geom_col() +
  labs(title = "Individual",
       subtitle = "5 first PCs",
       y = "Explained Variance") +
  theme_minimal() +
  theme(legend.position = "none")

p3 <- pca_var |> 
  ggplot(aes(x = PC, 
             y = cumulative)) +
  geom_line() +
  geom_point(color = "#ED6F58") +
  labs(title = "Cumulative",
       y = "Explained Variance") +
  scale_y_continuous(breaks = seq(from = 0.4,
                                  to = 1,
                                  by = 0.1)) +
  theme_minimal() +
  theme(legend.position = "none")

p1 / (p2 + p3)

ggsave(filename = "05_variance_plot.png",
       path = here("results"))

The cumulative plot shows that the first 5 principal components explain over 70% of the variance. As a result they are examined to see if they can be used to separate good and poor responders.

Data in principal component coordinates

First the data in the coordinates for the 5 first principal components is split into good or poor response to find the 2 principal components that are most likely to separate them.

pca |> 
  augment(analysis_data) |> 
  pivot_longer(cols = starts_with(".fitted"),
               names_to = "PC",
               values_to = "Coordinates") |> 
  mutate(PC = as.numeric(str_extract(PC,
                                     "[0-9]+"))) |> 
  filter(PC < 6) |> 
  mutate(PC = factor(PC)) |> 
  ggplot(aes(x = Coordinates, 
             y = PC, 
             fill = Response_group,
             color = Response_group)) +
  geom_density_ridges(alpha = 0.5,
                      scale = 1.2) +
  scale_fill_manual(values = c("#E69F00", "#0072B2")) +
  scale_color_manual(values = c("#E69F00", "#0072B2")) +
  theme_minimal() +
  theme(legend.position = "bottom",
        legend.title = element_blank())
Picking joint bandwidth of 14.3

ggsave(filename = "05_1d_plot.png",
       path = here("results"))

The plot shows that none of the 5 first principal componets separate the good and poor responders well. Principal component 2 and 3 are chosen as they show most separation and maybe together can separate the type of response better.

pca |> 
  augment(analysis_data) |> 
  rename_with(.cols = starts_with(".fitted"),
              .fn = ~str_extract(.x,
                                 "PC[0-9]+")) |> 
  ggplot(aes(x = PC2, 
             y = PC3, 
             color = Response_group)) +
  geom_point(size = 3) +
  scale_color_manual(values = c("#E69F00", "#0072B2")) +
  theme_minimal() +
  theme(legend.title = element_blank(),
        legend.box.background = element_rect(color = "lightgrey"))

ggsave(filename = "05_2d_plot.png",
       path = here("results"))

The plot shows no separation so the PCA does not show a connection between the signals pre vaccine and the response to it.

06_analysis2

To examine the connection between poor or good response to the vaccine and some of the metadata the figures in figure 2 in the paper are recreated

Plot A:

library("tidyverse")
library("here")

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
analysis_data <- read_tsv(here("data", "03_expression_dat_aug.tsv.gz"))
plot_data <- analysis_data |>
  group_by(Age_Group, Strains) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(Age_Group) |>
  mutate(rate = 100 * n / sum(n)) |>
  ungroup() |>
  mutate(Strains = factor(Strains, levels = 0:3))
ggplot(plot_data,
       aes(x = as.numeric(as.character(Strains)),
           y = rate,
           fill = Age_Group)) +
  geom_col(position = "dodge", colour = "black", width = 0.7) +
  scale_x_continuous(breaks = 0:3, labels = 0:3) +
  scale_fill_manual(values = c("Older" = "white", "Young" = "black")) +
  geom_vline(xintercept = 1.5, linetype = "dashed") +
  labs(x = "Number of strains",
       y = "Seroconversion rate (%)",
       fill = NULL) +
  theme_bw() +
  theme(panel.grid = element_blank())

If only 0 or 1 of the strains had a 4-fold antibody increase they were labeled poor responders (left of the dotted line). The good responders were those where 2 or 3 of the strains had a 4-fold antibody increase (right of the dotted line).

ggsave(filename = "06_A_plot.png",
       path = here("results"))

Plot B:

plot_b_data <- analysis_data |>
  count(Age_Group, Response_group) |>
  group_by(Age_Group) |>
  mutate(percent = 100 * n / sum(n)) |>
  ungroup() |>
  mutate(
    Age_Group = factor(Age_Group, levels = c("Young", "Older")),
    Response_group = factor(Response_group, levels = c("Poor responders", "Good responders"))  
  )

ggplot(plot_b_data, aes(y = Age_Group, x = percent, fill = Response_group)) +
  geom_col(position = "fill", colour = "black", width = 0.6) +
  scale_x_continuous(labels = scales::percent_format()) +
  scale_fill_manual(
    values = c("Poor responders"="white", "Good responders" = "black"),
    labels = c("PR", "GR")
  ) +
  labs(x = "Percent response", y = NULL, fill = NULL) +
  theme_bw() +
  theme(panel.grid = element_blank())

There is larger percent of the young age group that are good responders than in the older age group. The color of the plot makes it difficult to see that the white is also a group so it is improved with color.

ggsave(filename = "06_B1_plot.png",
       path = here("results"))

Improved version of plot B:

plot_b_data <- analysis_data |>
  count(Age_Group, Response_group) |>
  group_by(Age_Group) |>
  mutate(percent = 100 * n / sum(n)) |>
  ungroup() |>
  mutate(
    Age_Group = factor(Age_Group, levels = c("Young", "Older")),
    Response_group = factor(Response_group, levels = c("Poor responders", "Good responders"))  
  )

ggplot(plot_b_data, aes(y = Age_Group, x = percent, fill = Response_group)) +
  geom_col(position = "fill", colour = "black", width = 0.6) +
  scale_x_continuous(labels = scales::percent_format()) +
  scale_fill_manual(
    values = c("Poor responders"="#0072B2", "Good responders" = "#E69F00"),
    labels = c("PR", "GR")
  ) +
  labs(x = "Percent response", y = NULL, fill = NULL) +
  theme_bw() +
  theme(panel.grid = element_blank())

ggsave(filename = "06_B2_plot.png",
       path = here("results"))

Plot C:

plot_c_data <- analysis_data |>
  mutate(pre_GMT = exp(rowMeans(log(cbind(H1N1v1, Bv1, H3N2v1)))),
         log_pre_GMT = log(pre_GMT))

ggplot(plot_c_data, aes(x = Age_Group, y = log_pre_GMT)) +
  geom_boxplot(fill = "white", color = "black") +
  labs(x = NULL, y = "log(pre-GMT)") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        legend.position = "none")

The geometric mean titer pre vaccine is higher in the young age group than in the older

ggsave(filename = "06_C_plot.png",
       path = here("results"))

07_analysis3

Load libraries and data

library("tidyverse")
library("here")
library("purrr")
library("broom")
library("ggrepel")
source("99_proj_func.R")

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
analysis_data <- read_tsv(here("data", "03_expression_dat_aug.tsv.gz"))

In this analysis, we are going to use the Probes signal prior to vaccination from all candidates to find possible biomarkers that predict the response to the vaccine.

First step, is to make a long table of our data

analysis_data_long <- analysis_data |>
  select(file, Response_group, starts_with("ILMN_")) |> 
  pivot_longer(cols = !c(file, Response_group),
              names_to = "PROBE_ID",
              values_to = "log2_signal")

Then, we apply T-test to the signal data for each probe comparing the Good responders to the Poor responders. Since we have the Fold Change between the 2 groups, we can now apply the tidy() formula to calculate the Pvalue, and from that the Pvalue adjusted.

analysis_data_final <- analysis_data_long |> 
  group_by(PROBE_ID) |> 
  nest() |> 
  mutate(
    ttest_results = map(data, ~ t.test(log2_signal ~ Response_group, data = .x, var.equal = FALSE)),
    tidy_results = map(ttest_results, tidy)) |> 
  unnest(tidy_results) |> 
  ungroup() |> 
  mutate(p.adj = p.adjust(p.value, method = "BH")) |> 
  mutate(log2_FC = estimate1 - estimate2 ) |> 
  select(PROBE_ID, log2_FC, p.value, p.adj) |> 
  arrange(p.adj)

Let’s visualize our results with a volcano plot.

analysis_data_final |> 
ggplot(aes(x = log2_FC, y = -log10(p.adj))) +
  geom_point(aes(alpha = 0.05), color = "salmon", position="dodge") +
  geom_text_repel(
    data = analysis_data_final |> 
           filter(p.adj < 0.03, abs(log2_FC) > 1.5),
    aes(label = PROBE_ID),
    size = 2.5,
    color = "turquoise3",
    max.overlaps = 50,
    nudge_y = 0.1) +
  scale_y_continuous(limits = c(0, max(-log10(analysis_data_final |> pull(p.adj)), na.rm = TRUE) * 1.25)) +
  labs(
    title = "Probes predicting the vaccine response",
    subtitle = "Probes highlighted in turquoise were significant after multiple testing correction",
    x = expression(log[2]~"Fold Change (Good / Poor Responder)"),
    y = expression(-log[10]~"Adjusted p-value (FDR)"),
    caption = "Data from GSE41080") +
  theme_minimal() +
  theme(legend.position = "none")

ggsave(filename = "07_volcano_plot.png",
       path = here("results"))
Warning: Width not defined
ℹ Set with `position_dodge(width = ...)`
Warning: ggrepel: 251 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

We observe that probes ILMN_1688780 and ILMN_1739792 are the most differentiated probes, based on the visible distance from the center and high -log10(p.adj) value. In other words, these probes showed significantly lower baseline expression in candidates that proved to be good responders. To find the best predictor we will use a boxplot.

analysis_data_long |> 
  filter(PROBE_ID == c("ILMN_1688780", "ILMN_1739792")) |> 
  ggplot(aes(x= Response_group, y = log2_signal, fill = Response_group)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_boxplot(width = 0.15, fill = "white", outlier.shape = NA) +
  geom_jitter(shape = 16, position = position_jitter(0.2), size = 2, alpha = 0.7) +
  facet_wrap(~ PROBE_ID, scales = "free_y", ncol = 4) +
  coord_cartesian(ylim = c(min(analysis_data_long |> pull(log2_signal), na.rm = TRUE) * 1.1, 
                           max(analysis_data_long |> pull(log2_signal), na.rm = TRUE) * 1.05)) +
  labs(
    title ="Baseline Expression of possible Predictors:\nILMN_1688780 and ILMN_1739792",
    subtitle = "Lower pre-vaccine expression predicts a Good Response",
    x = "Vaccine Antibody Response Group",
    y = expression(log[2]~"Signal (Pre-Vaccine Expression)")
  ) +
  scale_fill_manual(values = c("Good responders" = "turquoise3", "Poor responders" = "salmon")) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(filename = "07_boxplot.png",
       path = here("results"))

Based on the final plot, we observe that the most significant differentiation between the two response groups comes from probe ILMN_1688780.